conda install pandas
Collecting package metadata (current_repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 4.10.3
latest version: 23.3.1
Please update conda by running
$ conda update -n base -c defaults conda
# All requested packages already installed.
Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import os

# Load the raw operations export; the path is relative to the notebook's working directory.
operations = pd.read_csv("OperationsAssociateRawFile.csv")

# Show a compact preview instead of dumping all ~66k rows into the output area.
print(f"Loaded {operations.shape[0]} rows x {operations.shape[1]} columns")
operations.head()
patient_id provider_id insurance_type dob \
0 46344 24 Zeta 1981-07-02T00:00:00.000Z
1 46999 24 Beta Blues 1983-04-04T00:00:00.000Z
2 47198 24 Beta Blues 1983-04-04T00:00:00.000Z
3 48294 24 Beta Blues 1984-05-10T00:00:00.000Z
4 49648 24 Beta Blues 1966-12-10T00:00:00.000Z
... ... ... ... ...
65878 128707 13182 Alpha NorCal 1986-08-15T00:00:00.000Z
65879 127456 13203 Alpha NorCal 2011-10-21T00:00:00.000Z
65880 127638 13203 Alpha NorCal 2013-01-27T00:00:00.000Z
65881 127976 13203 Alpha NorCal 1968-08-08T00:00:00.000Z
65882 129117 13267 Cash 1998-06-03T00:00:00.000Z
gender date_created booked_from state utm_campaign \
0 Male 2022-07-03T23:19:16.743Z grow FL 1.39E+11
1 Female 2022-07-06T15:10:32.063Z grow FL NaN
2 Female 2022-07-06T20:39:39.774Z grow FL 1.39E+11
3 Female 2022-07-11T10:19:08.470Z grow FL 1.39E+11
4 Female 2022-07-14T16:38:29.598Z grow FL NaN
... ... ... ... ... ...
65878 Female 2022-12-30T04:25:55.105Z grow CA 35-email-prov
65879 Female 2022-12-28T16:25:21.364Z grow CA 1-standard-profile
65880 Female 2022-12-28T19:33:13.220Z grow CA 1-standard-profile
65881 Male 2022-12-29T04:42:21.056Z grow CA NaN
65882 Female 2022-12-30T23:00:45.425Z grow NY 1.32E+11
utm_source utm_medium
0 google cpc
1 grow organic
2 google cpc
3 google cpc
4 grow organic
... ... ...
65878 Alpha email
65879 Alpha text
65880 Alpha email
65881 grow organic
65882 google cpc
[65883 rows x 11 columns]
# Flag every row whose patient_id repeats an earlier row, then report the total.
duplicates = operations.duplicated(subset=['patient_id'])
dup_total = duplicates.sum()
print('Number of duplicates in patient_id column:', dup_total)
Number of duplicates in patient_id column: 0
# Every distinct tracking medium present in the data (typos and variants included).
unique_utm_medium = operations['utm_medium'].unique()
print(unique_utm_medium)
['cpc' 'organic' 'booking-link' 'api' 'text' 'email' 'call' 'profile-link' 'switch' 'form' 'email for scheduling.' 'affliate' 'referral' 'Yext' "email'"]
# Tally how often each tracking medium occurs, most frequent first.
count_utm_medium = operations['utm_medium'].value_counts()
print(count_utm_medium)
# 'api' is the most common utm_medium
api 21971 organic 15911 email 9596 cpc 7590 text 4916 call 2480 booking-link 1719 switch 1177 profile-link 499 affliate 18 Yext 2 form 1 email for scheduling. 1 referral 1 email' 1 Name: utm_medium, dtype: int64
import matplotlib.pyplot as plt  # fix: plt was never imported, causing a NameError

# Horizontal bar chart of utm_medium frequencies.
count_utm_medium = operations['utm_medium'].value_counts()
count_utm_medium.plot(kind='barh', color='skyblue')
plt.title('Count of UTM Medium')
plt.xlabel('Count')
plt.ylabel('UTM Medium')
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) /var/folders/bl/5kk4pcgd6vqdpfcs3h7lpqn80000gn/T/ipykernel_1283/465834050.py in <module> 1 count_utm_medium = operations['utm_medium'].value_counts() 2 count_utm_medium.plot(kind='barh', color='skyblue') ----> 3 plt.title('Count of UTM Medium') 4 plt.xlabel('Count') 5 plt.ylabel('UTM Medium') NameError: name 'plt' is not defined
# Distinct acquisition sources (note the inconsistent naming, e.g.
# 'choosingtherapy' vs 'choosing-therapy' vs 'choosing_therapy').
unique_utm_source = operations['utm_source'].unique()
print(unique_utm_source)
['google' 'grow' 'healthgrades' 'zocdoc' 'unk-source' 'grow-reclaim' 'mhm' 'psychology-today' 'Zeta -directory' 'choosingtherapy' 'bing' 'pt' 'office' 'therapy-for-black-girls' 'therapyden' 'facebook' 'caredash' 'provider-sourced' 'GetWell-health-directory' 'Alpha' 'humana-directory' 'aircall' 'optum' 'mental-health-match' 'zencare' 'c-directory' 'bcbs-directory' 'tiktok' 'optum-directory' 'therapy-for-black-men' 'therapy-tribe' 'Gamma-directory' 'latinx' 'univ-ill-bcbs' 'grow-rematch' 'choosing-therapy' 'therapy-for-latinx' 'stackadapt' 'tfbm' 'gmb' 'yext' 'sendgrid' 'sendinblue' 'choosing_therapy' 'talkiatry' 'extnet' 'google-adc' 'good-therapy' 'hs_email' 'hs_automation']
# Frequency of each acquisition source, most common first.
utm_source_counts = operations['utm_source'].value_counts()
print(utm_source_counts)
# zocdoc is the top source
zocdoc 22045 grow 16134 Alpha 9335 google 6302 psychology-today 2299 pt 1526 unk-source 1333 office 1259 grow-reclaim 985 bing 938 provider-sourced 855 therapy-for-black-girls 471 c-directory 358 mhm 274 facebook 204 tiktok 180 optum 171 bcbs-directory 139 healthgrades 134 GetWell-health-directory 126 therapy-for-black-men 114 zencare 104 caredash 94 aircall 92 choosingtherapy 61 mental-health-match 56 Zeta -directory 55 Gamma-directory 53 hs_automation 45 therapyden 42 grow-rematch 21 choosing_therapy 18 humana-directory 18 optum-directory 9 univ-ill-bcbs 6 sendgrid 6 choosing-therapy 4 therapy-for-latinx 2 extnet 2 good-therapy 2 hs_email 2 stackadapt 1 tfbm 1 latinx 1 yext 1 sendinblue 1 therapy-tribe 1 talkiatry 1 google-adc 1 gmb 1 Name: utm_source, dtype: int64
import plotly.graph_objects as go
import matplotlib.pyplot as plt
!pip install squarify
import squarify
Requirement already satisfied: squarify in ./opt/anaconda3/lib/python3.9/site-packages (0.4.3)
# Bar chart of registrations per acquisition source.
utm_source_counts = operations['utm_source'].value_counts()

BAR_COLOR = '#b48c1f'

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(utm_source_counts.index, utm_source_counts.values, color=BAR_COLOR)

# Source names are long, so stand the tick labels upright.
plt.xticks(rotation=90)

ax.set_title('Number of Observations by UTM Source')
ax.set_xlabel('UTM Source')
ax.set_ylabel('Number of tracking of sources')
plt.show()
# Every distinct campaign label (a mix of numeric IDs rendered in scientific
# notation, structured slugs, and malformed free-text entries).
utm_campaign_types = operations['utm_campaign'].unique()
print(utm_campaign_types)
['1.39E+11' nan '1-standard-profile' '12-zendesk' '5-abandoned-link-grow' '1.32E+15' '32-intercom' '1.32E+11' '1.36E+11' '1.33E+11' '1.35E+11' '1.42E+11' 'f1039|grow-therapy|client|rmk|conv|all|a|---|ca|booking-page-visitors' '16-internal-referral' '1.33E+15' '6-abandoned-link-zocdoc' 'intercom' '1.44E+11' '1.31E+15' '33-grow-cancel' 'f1038|grow-therapy|client|pst|conv|florida|a|18-45|lal|appointment-confirmation-0-120-0-1' 'tik1005|grow-therapy|client|pst|conv|florida|f|18-34|vi|health-wellness-video-interactions' 'tik1006|grow-therapy|client|pst|conv|florida|a|18-34|lal|clicks-6-sec-vid-view' 'tik1002|grow-therapy|client|pst|conv|florida|f|18-34|int|self-care-interests' '26-grow-rematch' '1.38E+11' '29-abandoned-first-apt' 'f1036|grow-therapy|client|rmk|conv|all|a|---|ca|site-visitors-started-booking-event' '35-email-prov' 'tik1008|grow-therapy|client|pst|conv|florida|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions' 'client_share' 'f1045|grow-therapy|client|pst|conv|florida|a|18-45|lal|appointment-confirmation-0-120-0-1' 'f1042|grow-therapy|client|pst|conv|florida|f|18-24|dit|therapy-self-care-interests' 'sa1005|grow-therapy|client|pst|display|conv|florida|all|---|pcai|therapy-counseling-pcai' 'tik1020|grow-therapy|client|pst|conv|virginia|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions' 'f1041|grow-therapy|client|pst|conv|pennsylvania|a|18-45|lal|appointment-confirmation-0-120-0-1' 'f1059|grow-therapy|client|pst|conv|tx-ga-pa-ct-md-in|a|18-45|lal|appointment-confirmation-0-120-0-1' 'tik1017|grow-therapy|client|pst-test|conv|fl-tx|f|18-34|int-vi|interests-video-interactions-homepage-url-test' '13-gmb-trial' 'f1040|grow-therapy|client|pst|conv|georgia|a|18-45|lal|appointment-confirmation-0-120-0-1' 'tik1018|grow-therapy|client|pst-exp-aud-test|conv|fl-tx|f|18-34|int-vi|interests-video-interactions-control-audience' '1.43E+11' '1-standard-profile https://provider.growtherapy.com/book-appointment?id=974' '1.41E+11' 
'tik1010|grow-therapy|client|pst|conv|texas|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions' 'mham_beforeapril16' 'tik1009|grow-therapy|client|pst|conv|pennsylvania|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions' '1-standard-profile"' 'f1048|grow-therapy|client|pst|conv|florida|f|18-24|dit|therapy-self-care-interests' 'f1054|grow-therapy|client|pst|conv|texas|a|18-45|lal|appointment-confirmation-0-120-0-1' 'tik1004|grow-therapy|client|pst|conv|pennsylvania|f|18-34|int|self-care-interests' 'f1032|grow-therapy|client|rmk|conv|all|a|---|ca|booking-page-visitors' '1-standard-profilehttps://provider.growtherapy.com/book-appointment?id=1589' 'Reschedule_Request_Individual_links_Grow_1' 'email_name' '24-talkiatry-test' 'tik1022|grow-therapy|client|pst|conv|illinois|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions' 'mham_afterapril16' 'tik1003|grow-therapy|client|pst|conv|georgia|f|18-34|int|self-care-interests' 'tik1015|grow-therapy|client|rmk|conv|all|a|---|ca|clicks-6-sec-vid-view-site-visitors' '1-standard-' 'tik1013|grow-therapy|client|pst|conv|maryland|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions' 'tik1014|grow-therapy|client|pst|conv|connecticut|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions' '12-zendesk/' 'tik1016|grow-therapy|client|pst-test|conv|fl-tx|f|18-34|int-vi|interests-video-interactions-filters-url-test' 'tik1007|grow-therapy|client|rmk|conv|all|a|---|ca|clicks-6-sec-vid-view' 'tik1019|grow-therapy|client|pst-exp-aud-test|conv|fl-tx|f|18-34|int-vi|interests-video-interactions-expanded-targeting-audience' 'tik1011|grow-therapy|client|pst|conv|indiana|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions' 'f1033|grow-therapy|client|ret|conv|all|a|---|ca|appointment-confirmation-visitors' '1-standard-profilepsychoatrist that accept anthem' 'f1052|grow-therapy|client|pst|conv|pennsylvania|f|18-24|dit|therapy-self-care-interests' 
'tik1021|grow-therapy|client|rmk|conv|all|a|---|ca|site-visitors-100%-views' 'f1051|grow-therapy|client|pst|conv|georgia|f|18-24|dit|therapy-self-care-interests' '1.34E+11' '1.45E+11' 'f1060|grow-therapy|client|rmk|conv|all|a|---|ca|booking-page-visitors' '1.48E+11']
# Get the count of each unique value in the 'utm_campaign' column
utm_campaign_count = operations['utm_campaign'].value_counts()
# Print the count of each unique value
print(utm_campaign_count)
# '1-standard-profile' has the most
1-standard-profile 32788
12-zendesk 6565
1.32E+11 1724
1.39E+11 1704
16-internal-referral 825
...
12-zendesk/ 1
tik1013|grow-therapy|client|pst|conv|maryland|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions 1
1-standard- 1
Reschedule_Request_Individual_links_Grow_1 1
1.48E+11 1
Name: utm_campaign, Length: 76, dtype: int64
# Parse the ISO-8601 timestamps into datetimes.
operations['date_created'] = pd.to_datetime(operations['date_created'])

# Unique patients who registered in each calendar month.
monthly_patients = operations.groupby(pd.Grouper(key='date_created', freq='M'))['patient_id'].nunique()
print(monthly_patients)
date_created 2022-07-31 00:00:00+00:00 7627 2022-08-31 00:00:00+00:00 10028 2022-09-30 00:00:00+00:00 11054 2022-10-31 00:00:00+00:00 12049 2022-11-30 00:00:00+00:00 13028 2022-12-31 00:00:00+00:00 12097 Freq: M, Name: patient_id, dtype: int64
import plotly.express as px

# to_datetime is a no-op on already-parsed datetimes, so re-running is safe.
operations['date_created'] = pd.to_datetime(operations['date_created'])

# Unique patients per month, reset to a frame for plotting.
monthly_patients = operations.groupby(pd.Grouper(key='date_created', freq='M'))['patient_id'].nunique().reset_index()
monthly_patients['month_year'] = monthly_patients['date_created'].dt.strftime('%Y-%b')

# Treemap: one tile per month, sized by patient count.
fig = px.treemap(monthly_patients, path=['month_year'], values='patient_id')

# Show the counts directly on the tiles.
fig.data[0].textinfo = 'label+text+value'
fig.data[0].textfont.size = 25
fig.show()
# Re-parse defensively, then derive the weekday name for each registration.
operations["date_created"] = pd.to_datetime(operations["date_created"])
operations["day_of_week"] = operations["date_created"].dt.day_name()

# value_counts() sorts descending, so the first index entry is the busiest day.
day_counts = operations["day_of_week"].value_counts()
print("The most popular day of the week patients registered for is:", day_counts.index[0])
The most popular day of the week patients registered for is: Tuesday
import pandas as pd
import plotly.express as px

# Impose a Monday-first ordering rather than the default frequency ordering.
ordered_weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
operations['day_of_week'] = pd.Categorical(operations['day_of_week'], categories=ordered_weekdays, ordered=True)

# Counts in weekday order (sort_index honours the categorical ordering above).
day_counts = operations['day_of_week'].value_counts().sort_index()

# Frame the counts for plotly.
df = pd.DataFrame({'Day of Week': day_counts.index, 'Count': day_counts.values})

# Bubble chart: both bubble size and colour encode the registration count.
fig = px.scatter(df, x='Day of Week', y='Count', size='Count', color='Count',
                 color_continuous_scale='reds')
fig.update_layout(title='Patient Registrations by Day of Week',
                  xaxis_title='Day of Week',
                  yaxis_title='Number of Registrations')
fig.show()
# Unique patients per insurance type, and the overall patient base size.
insurance_counts = operations.groupby('insurance_type')['patient_id'].nunique()
total_patients = operations['patient_id'].nunique()

# Share of the patient base held by each insurer.
insurance_percentages = insurance_counts / total_patients * 100

# idxmax() yields the insurer label with the largest share.
highest_percentage = insurance_percentages.idxmax()
print("The insurance company with the highest percentage of patients is:", highest_percentage)
The insurance company with the highest percentage of patients is: Beta Blues
import plotly.express as px

# Share of unique patients per insurance type.
insurance_counts = operations.groupby('insurance_type')['patient_id'].nunique()
total_patients = operations['patient_id'].nunique()
insurance_percentages = insurance_counts / total_patients * 100

# Keep only insurers covering more than 1% of patients, to avoid sliver wedges.
insurance_percentages_filtered = insurance_percentages[insurance_percentages > 1]

# Frame the filtered shares for plotly.
df = pd.DataFrame({'Insurance Type': insurance_percentages_filtered.index,
                   'Percentage': insurance_percentages_filtered.values})

# Pie chart with a colour-blind-safe qualitative palette.
fig = px.pie(df, values='Percentage', names='Insurance Type',
             color='Insurance Type', color_discrete_sequence=px.colors.qualitative.Safe)
fig.update_layout(title='Percentage of Insurance Types')
fig.show()
# Fraction of unique patients per insurer; idxmin() picks the smallest share.
insurance_shares = operations.groupby('insurance_type')['patient_id'].nunique() / operations['patient_id'].nunique()
lowest_insurance_company = insurance_shares.idxmin()
print("The insurance company with the lowest percentage of patients is:", lowest_insurance_company)
The insurance company with the lowest percentage of patients is: ANTHEM
# Distinct state values (note the stray full name 'Florida' alongside 'FL').
states = operations['state'].unique()
print(states)
['FL' 'NH' 'TX' 'CA' 'SC' 'IN' 'NY' 'GA' 'OH' 'CT' 'PR' 'KY' 'IL' 'DE' 'VA' 'NE' 'PA' 'VT' 'CO' 'AK' 'Florida' 'OR' nan 'WY' 'MA' 'WA' 'NC' 'MD' 'WI' 'NJ' 'RI' 'TN' 'MI' 'AL' 'DC' 'HI' 'MN' 'ME' 'AZ' 'OK' 'LA' 'UT' 'VI' 'MS' 'NM' 'KS' 'MO' 'NV' 'AR' 'MT' 'ID' 'IA' 'SD' 'WV']
# Count rows where the state was entered as the full name instead of 'FL'.
florida_count = (operations['state'] == 'Florida').sum()
# fix: the message previously named a non-existent 'booked_fromstate' column,
# but the code counts the 'state' column.
print("Number of times Florida appears in state column:", florida_count)
Number of times Florida appears in booked_fromstate column: 2
# Normalize the one inconsistent spelling, then confirm the cleanup worked.
operations["state"] = operations["state"].replace("Florida", "FL")
states = operations['state'].unique()
print(states)
['FL' 'NH' 'TX' 'CA' 'SC' 'IN' 'NY' 'GA' 'OH' 'CT' 'PR' 'KY' 'IL' 'DE' 'VA' 'NE' 'PA' 'VT' 'CO' 'AK' 'OR' nan 'WY' 'MA' 'WA' 'NC' 'MD' 'WI' 'NJ' 'RI' 'TN' 'MI' 'AL' 'DC' 'HI' 'MN' 'ME' 'AZ' 'OK' 'LA' 'UT' 'VI' 'MS' 'NM' 'KS' 'MO' 'NV' 'AR' 'MT' 'ID' 'IA' 'SD' 'WV']
# nunique() ignores NaN; the 52 values include DC and territories (PR, VI).
num_unique_states = operations['state'].nunique()
print(f"There are {num_unique_states} unique states in the operations dataframe.")
There are 52 unique states in the operations dataframe.
# With ~66k rows and only ~52 distinct states, repeats are certain;
# this just confirms the column is not unique per row.
duplicates = operations['state'].duplicated()

if duplicates.any():
    print("There are duplicates in the states column")
else:
    print("There are no duplicates in the states column")
There are duplicates in the states column
# Row counts per state (count() excludes rows with a NaN state).
state_counts = operations.groupby('state')['patient_id'].count()

# idxmax/idxmin give the state labels; .loc fetches their counts.
highest_state = state_counts.idxmax()
highest_count = state_counts.loc[highest_state]
lowest_state = state_counts.idxmin()
lowest_count = state_counts.loc[lowest_state]

print(f"The state with the highest number of patients is {highest_state} with {highest_count} patients.")
print(f"The state with the lowest number of patients is {lowest_state} with {lowest_count} patients.")
The state with the highest number of patients is FL with 16506 patients. The state with the lowest number of patients is VI with 2 patients.
import plotly.express as px

# Patient counts per state, framed so plotly can map state -> count.
state_counts = operations.groupby('state')['patient_id'].count().reset_index()

# Locate the extreme rows once and reuse them for the annotation text.
max_row = state_counts['patient_id'].idxmax()
min_row = state_counts['patient_id'].idxmin()
highest_state = state_counts.loc[max_row, 'state']
highest_count = state_counts.loc[max_row, 'patient_id']
lowest_state = state_counts.loc[min_row, 'state']
lowest_count = state_counts.loc[min_row, 'patient_id']

# Choropleth keyed on two-letter state codes.
fig = px.choropleth(state_counts, locations='state', locationmode='USA-states',
                    color='patient_id', scope='usa', color_continuous_scale='YlOrRd')

# Caption the extremes beneath the map.
fig.update_layout(title='Patient Counts by State',
                  annotations=[
                      dict(
                          x=0.5,
                          y=-0.1,
                          xref='paper',
                          yref='paper',
                          showarrow=False,
                          text=f'Highest count in {highest_state}: {highest_count} patients | Lowest count in {lowest_state}: {lowest_count} patients'
                      )
                  ])
fig.show()
# Which booking channel each registration came through.
booked_from_counts = operations['booked_from'].value_counts()
print(booked_from_counts)
grow 43889 zocdoc 21994 Name: booked_from, dtype: int64
import pandas as pd
import matplotlib.pyplot as plt

# Channel counts, framed for plotting.
booked_from_counts = operations['booked_from'].value_counts()
df = pd.DataFrame({'booked_from': booked_from_counts.index, 'count': booked_from_counts.values})

# value_counts() is already descending, but sort explicitly to be safe.
df = df.sort_values('count', ascending=False)

# Lollipop plot: a stem from zero up to each channel's count.
plt.stem(df['booked_from'], df['count'], basefmt=' ')
plt.xlabel('booked_from')
plt.title('Counts of Booked From')
plt.show()
# Distinct provider IDs present in the bookings.
unique_providers = operations['provider_id'].unique()
print(unique_providers)
[ 24 37 38 ... 13182 13203 13267]
# Bookings per provider (row count, not unique patients).
provider_counts = operations.groupby('provider_id')['patient_id'].count()

# Busiest and quietest providers by booking volume.
max_provider = provider_counts.idxmax()
min_provider = provider_counts.idxmin()
print(f"The provider with the most patients is {max_provider} with {provider_counts[max_provider]} patients.")
print(f"The provider with the least patients is {min_provider} with {provider_counts[min_provider]} patients.")
The provider with the most patients is 3244 with 441 patients. The provider with the least patients is 38 with 1 patients.
# Unique patients per provider (differs from raw row counts when a patient repeats).
provider_patient_counts = operations.groupby('provider_id')['patient_id'].nunique()
print(provider_patient_counts)
provider_id
24 29
37 43
38 1
44 21
45 18
..
13148 1
13155 1
13182 5
13203 3
13267 1
Name: patient_id, Length: 3287, dtype: int64
# Per provider: how many rows belong to providers that appear more than once.
# duplicated(keep=False) marks every row of a repeated provider_id as True, so
# summing per provider yields the row count for repeated providers and 0 for
# providers that appear only once.
duplicate_counts = operations.duplicated(subset=['provider_id'], keep=False).groupby(operations['provider_id']).sum().astype(int)
print(duplicate_counts)
provider_id
24 29
37 43
38 0
44 21
45 18
..
13148 0
13155 0
13182 5
13203 3
13267 0
Length: 3287, dtype: int64
import pandas as pd
import plotly.express as px

# Rows per provider, counting only providers that appear more than once
# (providers seen a single time contribute 0).
duplicate_counts = operations.duplicated(subset=['provider_id'], keep=False).groupby(operations['provider_id']).sum().astype(int)

# The five providers with the most repeated bookings.
top_providers = duplicate_counts.sort_values(ascending=False).head(5)
df = pd.DataFrame({'Provider ID': top_providers.index, 'Duplicate Count': top_providers.values})

# A single-level sunburst renders like a donut chart.
fig = px.sunburst(df, path=['Provider ID'], values='Duplicate Count', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(title='Top 5 Providers with the Most Duplicates')
fig.show()
# NOTE(review): this repeats the duplicate-count computation from an earlier
# cell verbatim (only the variable name differs); consider removing one copy.
provider_duplicate_counts = operations.duplicated(subset=['provider_id'], keep=False).groupby(operations['provider_id']).sum().astype(int)
print(provider_duplicate_counts)
provider_id
24 29
37 43
38 0
44 21
45 18
..
13148 0
13155 0
13182 5
13203 3
13267 0
Length: 3287, dtype: int64
import matplotlib.pyplot as plt

# The ten busiest providers by row count.
provider_counts = operations['provider_id'].value_counts()
top_providers = provider_counts.head(10)

# Pie chart of their relative booking volume.
fig, ax = plt.subplots()
ax.pie(top_providers.values, labels=top_providers.index, autopct='%1.1f%%', startangle=90)
ax.set_title('Top 10 Providers with Highest Number of Occurrences')
plt.show()
# Most frequent gender value (idxmax is the label of the top count).
gender_counts = operations['gender'].value_counts()
most_common_gender = gender_counts.idxmax()
print("The most common gender among patients is:", most_common_gender)
The most common gender among patients is: Female
# Raw gender tallies; note the lowercase 'female'/'male' entries needing cleanup.
gender_counts = operations['gender'].value_counts()
print(gender_counts)
Female 44483 Male 20317 Other 764 Decline to Specify 312 female 4 male 2 Unknown 1 Name: gender, dtype: int64
# Fold the lowercase variants into the canonical labels, then re-tally.
operations["gender"] = operations["gender"].replace({"female": "Female", "male": "Male"})
gender_counts = operations['gender'].value_counts()
print(gender_counts)
Female 44487 Male 20319 Other 764 Decline to Specify 312 Unknown 1 Name: gender, dtype: int64
import matplotlib.pyplot as plt

# Bar chart of the cleaned gender distribution.
gender_counts = operations['gender'].value_counts()

fig, ax = plt.subplots()
ax.bar(gender_counts.index, gender_counts.values)
ax.set_xlabel('Gender')
ax.set_ylabel('Number of Patients')
ax.set_title('Patient Gender Distribution')
plt.show()
from datetime import datetime  # fix: datetime was never imported, causing a NameError

# Parse dates of birth (timezone-aware, per the ISO strings in the raw file).
operations['dob'] = pd.to_datetime(operations['dob'])

# Age in whole years: the date difference cast to numpy's year-resolution timedelta.
# NOTE(review): the '<m8[Y]' cast works on numpy 1.20 (this env) but is removed
# in newer numpy — confirm before upgrading.
now = datetime.now()
operations['age'] = (now.date() - operations['dob'].dt.date).astype('<m8[Y]')

avg_age = operations['age'].mean()
print(f"The average age of patients is {avg_age:.2f} years")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) /var/folders/bl/5kk4pcgd6vqdpfcs3h7lpqn80000gn/T/ipykernel_1283/883925839.py in <module> 3 4 # Calculate age by subtracting dob from current date ----> 5 now = datetime.now() 6 operations['age'] = (now.date() - operations['dob'].dt.date).astype('<m8[Y]') 7 NameError: name 'datetime' is not defined
from datetime import datetime

# Parse dates of birth into timezone-aware datetimes.
operations['dob'] = pd.to_datetime(operations['dob'])

# Age in whole years via numpy's year-resolution timedelta cast.
# NOTE(review): the '<m8[Y]' cast is removed in newer numpy; fine on numpy 1.20.
now = datetime.now()
operations['age'] = (now.date() - operations['dob'].dt.date).astype('<m8[Y]')

avg_age = operations['age'].mean()
print(f"The average age of patients is {avg_age:.2f} years")

# Gender is categorical, so report the mode — an "average" is not defined for it.
avg_gender = operations['gender'].mode()[0]
print(f"The most common gender of patients is {avg_gender}")
The average age of patients is 31.30 years The average gender of patients is Female
import plotly.express as px

# Work on a narrow copy so the main frame stays untouched.
age_gender_df = operations[['age', 'gender']].copy()

# Number of patients per (age, gender) combination.
age_gender_counts = age_gender_df.groupby(['age', 'gender']).size().reset_index(name='count')

# Bubble chart: size encodes the count, colour encodes gender.
fig = px.scatter(age_gender_counts, x='age', y='count', size='count', color='gender', title='Distribution of Age and Gender of Patients')
fig.show()
# Youngest and oldest patient age per gender.
age_by_gender = operations.groupby('gender')['age'].agg(['min', 'max'])

# Report the overall extremes together with the gender group they belong to.
print(f"The oldest patient is {age_by_gender.loc[age_by_gender['max'].idxmax()]['max']:.0f} years old and is {age_by_gender['max'].idxmax()}")
print(f"The youngest patient is {age_by_gender.loc[age_by_gender['min'].idxmin()]['min']:.0f} years old and is {age_by_gender['min'].idxmin()}")
The oldest patient is 99 years old and is Female The youngest patient is 2 years old and is Female
!pip install shap
Requirement already satisfied: shap in ./opt/anaconda3/lib/python3.9/site-packages (0.41.0) Requirement already satisfied: slicer==0.0.7 in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (0.0.7) Requirement already satisfied: packaging>20.9 in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (21.0) Requirement already satisfied: pandas in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (1.3.4) Requirement already satisfied: numba in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (0.54.1) Requirement already satisfied: scikit-learn in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (0.24.2) Requirement already satisfied: scipy in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (1.7.1) Requirement already satisfied: cloudpickle in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (2.0.0) Requirement already satisfied: numpy in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (1.20.3) Requirement already satisfied: tqdm>4.25.0 in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (4.62.3) Requirement already satisfied: pyparsing>=2.0.2 in ./opt/anaconda3/lib/python3.9/site-packages (from packaging>20.9->shap) (3.0.4) Requirement already satisfied: setuptools in ./opt/anaconda3/lib/python3.9/site-packages (from numba->shap) (58.0.4) Requirement already satisfied: llvmlite<0.38,>=0.37.0rc1 in ./opt/anaconda3/lib/python3.9/site-packages (from numba->shap) (0.37.0) Requirement already satisfied: python-dateutil>=2.7.3 in ./opt/anaconda3/lib/python3.9/site-packages (from pandas->shap) (2.8.2) Requirement already satisfied: pytz>=2017.3 in ./opt/anaconda3/lib/python3.9/site-packages (from pandas->shap) (2021.3) Requirement already satisfied: six>=1.5 in ./opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas->shap) (1.16.0) Requirement already satisfied: threadpoolctl>=2.0.0 in ./opt/anaconda3/lib/python3.9/site-packages (from scikit-learn->shap) (2.2.0) Requirement already 
satisfied: joblib>=0.11 in ./opt/anaconda3/lib/python3.9/site-packages (from scikit-learn->shap) (1.1.0)
import shap  # NOTE(review): shap is never used below; kept in case later cells need it
import matplotlib.pyplot as plt

# Hard-coded utm_medium counts (snapshot of the value_counts() output above).
count_utm_medium = {'api': 21971, 'organic': 15911, 'email': 9596, 'cpc': 7590, 'text': 4916, 'call': 2480, 'booking-link': 1719, 'switch': 1177, 'profile-link': 499, 'affliate': 18, 'Yext': 2, 'form': 1, 'email for scheduling.': 1, 'referral': 1, "email'": 1}

# Only media with at least this percentage of all rows get a wedge.
threshold = 2.5
total_count = sum(count_utm_medium.values())

# One preset colour per wedge; ax.pie consumes only as many colours as there
# are wedges, so the original's per-label colour appends past these presets
# were dead code and have been removed.
colors = ['#aebfff', '#80df56', '#009E73', '#F0E442', '#0072B2', '#00eacc', '#769d23']

# Build the wedge sizes and legend labels (label + its percentage share).
labels = []
sizes = []
for label, count in count_utm_medium.items():
    percent = 100 * count / total_count
    if percent >= threshold:
        labels.append(f"{label}\n({percent:.1f}%)")
        sizes.append(count)

# Donut chart (width=0.5 hollows the pie); the legend carries the labels.
fig, ax = plt.subplots()
ax.axis('equal')
wedges, texts, _ = ax.pie(sizes, wedgeprops=dict(width=0.5), startangle=-40, autopct='', colors=colors)
ax.legend(wedges, labels, loc="best", bbox_to_anchor=(0.9, 0.9))
ax.set_title("UTM Medium Distribution")
plt.show()
import matplotlib.pyplot as plt

# NOTE(review): this is a near-duplicate of the previous donut-chart cell; only
# the rotated right-hand title placement at the bottom differs.
count_utm_medium = {'api': 21971, 'organic': 15911, 'email': 9596, 'cpc': 7590, 'text': 4916, 'call': 2480, 'booking-link': 1719, 'switch': 1177, 'profile-link': 499, 'affliate': 18, 'Yext': 2, 'form': 1, 'email for scheduling.': 1, 'referral': 1, "email'": 1}

# Only media with at least this percentage of all rows get a wedge.
threshold = 2.5
total_count = sum(count_utm_medium.values())

# Preset wedge colours; ax.pie consumes only as many colours as there are
# wedges, so the original's per-label colour appends past these presets were
# dead code and have been removed.
colors = ['#aebfff', '#80df56', '#009E73', '#F0E442', '#0072B2', '#00eacc', '#769d23']

# Build the wedge sizes and legend labels (label + its percentage share).
labels = []
sizes = []
for label, count in count_utm_medium.items():
    percent = 100 * count / total_count
    if percent >= threshold:
        labels.append(f"{label}\n({percent:.1f}%)")
        sizes.append(count)

# Donut chart with the labels carried by the legend.
fig, ax = plt.subplots()
ax.axis('equal')
wedges, texts, _ = ax.pie(sizes, wedgeprops=dict(width=0.5), startangle=-40, autopct='', colors=colors)
ax.legend(wedges, labels, loc="best", bbox_to_anchor=(0.9, 0.9))
ax.set_title("UTM Medium Distribution")

# Move the title to the right edge and rotate it vertically.
ax.title.set_position([1.05, 0.5])
ax.title.set_rotation(90)
plt.show()